In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
pd.set_option('display.max_columns', None)

data = pd.read_excel('nba_player_data.xlsx')
In [2]:
data.sample(10)
Out[2]:
Unnamed: 0 Year Season_type PLAYER_ID RANK PLAYER TEAM_ID TEAM GP MIN FGM FGA FG_PCT FG3M FG3A FG3_PCT FTM FTA FT_PCT OREB DREB REB AST STL BLK TOV PF PTS EFF AST_TOV STL_TOV
2407 260 2020-21 Regular%20Season 1630181 260 R.J. Hampton 1610612753 ORL 51 888 136 313 0.435 27 87 0.310 55 82 0.671 29 150 179 87 21 10 53 61 354 394 1.64 0.40
1168 60 2022-23 Playoffs 1626157 61 Karl-Anthony Towns 1610612750 MIN 5 180 32 70 0.457 6 24 0.250 21 28 0.750 6 45 51 10 3 4 18 21 91 96 0.56 0.17
3328 402 2019-20 Regular%20Season 201587 403 Nicolas Batum 1610612766 CHA 22 505 28 81 0.346 14 49 0.286 9 10 0.900 25 75 100 66 17 8 22 41 79 194 3.00 0.77
3390 464 2019-20 Regular%20Season 1629718 464 Charlie Brown Jr. 1610612737 ATL 10 40 6 19 0.316 3 9 0.333 5 5 1.000 2 2 4 2 2 2 3 3 20 14 0.67 0.67
3589 134 2019-20 Playoffs 1627884 135 Derrick Jones Jr. 1610612748 MIA 15 97 8 17 0.471 4 9 0.444 2 5 0.400 6 6 12 7 6 5 2 12 22 38 3.50 3.00
3623 168 2019-20 Playoffs 1629598 169 Chris Clemons 1610612745 HOU 2 8 2 5 0.400 2 5 0.400 1 2 0.500 0 2 2 0 0 0 0 0 7 5 0.00 0.00
655 86 2022-23 Regular%20Season 1630180 87 Saddiq Bey 1610612737 ATL 77 2129 353 836 0.422 151 418 0.361 205 238 0.861 98 264 362 118 70 13 71 123 1062 1038 1.66 0.99
465 465 2023-24 Regular%20Season 1641720 465 Jalen Hood-Schifino 1610612747 LAL 21 109 10 45 0.222 2 15 0.133 12 20 0.600 2 11 13 8 3 2 9 13 34 8 0.89 0.33
2078 148 2021-22 Playoffs 1627854 148 Bryn Forbes 1610612743 DEN 5 76 6 15 0.400 4 11 0.364 4 5 0.800 1 2 3 7 1 0 2 7 20 19 3.50 0.50
3360 434 2019-20 Regular%20Season 1629662 435 Mfiondu Kabengele 1610612746 LAC 12 64 14 32 0.438 9 20 0.450 5 5 1.000 1 10 11 2 2 2 2 9 42 39 1.00 1.00
In [3]:
data.shape
Out[3]:
(3672, 31)
In [4]:
# Data cleaning & analysis preparation
In [5]:
data.isna().sum()
Out[5]:
Unnamed: 0     0
Year           0
Season_type    0
PLAYER_ID      0
RANK           0
PLAYER         0
TEAM_ID        0
TEAM           0
GP             0
MIN            0
FGM            0
FGA            0
FG_PCT         0
FG3M           0
FG3A           0
FG3_PCT        0
FTM            0
FTA            0
FT_PCT         0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
PF             0
PTS            0
EFF            0
AST_TOV        0
STL_TOV        0
dtype: int64
In [6]:
data.drop(columns=['RANK','EFF'], inplace=True)
In [7]:
data['season_start_year'] = data['Year'].astype(str).str[:4].astype(int)
In [8]:
data.TEAM.nunique()
Out[8]:
30
In [9]:
# Assuming 'data' is your DataFrame
# Extract the numerical part of the 'Year' column
data['Year'] = data['Year'].str.extract('(\d+)')

# Convert the extracted numerical part to float
data['Year'] = data['Year'].astype(float)
In [10]:
data['Season_type'].replace('Regular%20Season','RS', inplace=True)
In [11]:
rs_df = data[data['Season_type']=='RS']
playoffs_df = data[data['Season_type']=='Playoffs']
In [12]:
data.columns 
Out[12]:
Index(['Unnamed: 0', 'Year', 'Season_type', 'PLAYER_ID', 'PLAYER', 'TEAM_ID',
       'TEAM', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
       'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
      dtype='object')
In [13]:
total_cols = ['MIN', 'FGM', 'FGA', 'FG3M', 'FG3A','FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS'] 
In [14]:
# Which player stats are correlated with each other
In [ ]:
 
In [15]:
data_per_min = data.groupby(['PLAYER','PLAYER_ID','Year'])[total_cols].sum().reset_index()
for col in data_per_min.columns [4:]:
    data_per_min[col] = data_per_min[col]/data_per_min['MIN']
    
data_per_min['FG%'] = data_per_min['FGM']/data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M']/data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM']/data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A']/data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS']/data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M']/data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA']/data_per_min['FGA']
data_per_min['TRU%'] = 0.5*data_per_min['PTS']/(data_per_min['FGA']+0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST']/data_per_min['TOV']

data_per_min = data_per_min[data_per_min['MIN']>=100]
data_per_min.drop(columns='PLAYER_ID', inplace=True)
data_per_min.drop(columns='PLAYER', inplace=True)
data_per_min.drop(columns='Year', inplace=True)

fig = px.imshow(data_per_min.corr())
fig.show()
In [16]:
(data_per_min['MIN']>=100).mean()
Out[16]:
1.0
In [17]:
#How are minutes played distributed
In [18]:
fig = px.histogram(x=rs_df['MIN'], histnorm='percent')
fig.show()
In [19]:
# Minutes Comparison
def hist_data(df=rs_df, min_MIN=0, min_GP=0):
    return df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'MIN']/\
    df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'GP']

                                   
In [20]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=hist_data(rs_df,50,5), histnorm='percent', name='RS',
                           xbins={'start':0,'end':48,'size':1}))
fig.add_trace(go.Histogram(x=hist_data(playoffs_df,5,1), histnorm='percent',
                           name='Playoffs', xbins={'start':0,'end':48,'size':1}))

fig.update_traces(opacity=0.5)
fig.show()
In [21]:
((hist_data(playoffs_df,5,1)>=12)&(hist_data(playoffs_df,5,1)<=34)).mean()
Out[21]:
0.46973365617433416
In [22]:
# Points Comparison

def hist_data(df=rs_df, min_MIN=0, min_GP=0):
    return df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'PTS']/\
    df.loc[(df['MIN']>=min_MIN) & (df['GP']>min_GP), 'GP']
In [23]:
fig = go.Figure()
fig.add_trace(go.Histogram(x=hist_data(rs_df,50,5), histnorm='percent', name='RS',
                           xbins={'start':0,'end':48,'size':1}))
fig.add_trace(go.Histogram(x=hist_data(playoffs_df,5,1), histnorm='percent',
                           name='Playoffs', xbins={'start':0,'end':48,'size':1}))

fig.update_traces(opacity=0.5)
fig.show()
In [24]:
# How has the game changed over the past 5 years?
In [25]:
change_df = data.groupby('season_start_year')[total_cols].sum().reset_index()
change_df['POSS_est'] = change_df['FGA']-change_df['OREB']+change_df['TOV']+0.44*change_df['FTA']
change_df = change_df[list(change_df.columns[0:2])+['POSS_est']+list(change_df.columns[2:-1])]

change_df['FG%'] = change_df['FGM']/change_df['FGA']
change_df['3PT%'] = change_df['FG3M']/change_df['FG3A']
change_df['FT%'] = change_df['FTM']/change_df['FTA']
change_df['FG3A%'] = change_df['FG3A']/change_df['FGA']
change_df['PTS/FGA'] = change_df['PTS']/change_df['FGA']
change_df['FG3M/FGM'] = change_df['FG3M']/change_df['FGM']
change_df['FTA/FGA'] = change_df['FTA']/change_df['FGA']
change_df['TRU%'] = 0.5*change_df['PTS']/(change_df['FGA']+0.475*change_df['FTA'])
change_df['AST_TOV'] = change_df['AST']/change_df['TOV']

change_df
Out[25]:
season_start_year MIN POSS_est FGM FGA FG3M FG3A FTM FTA OREB DREB REB AST STL BLK TOV PF PTS FG% 3PT% FT% FG3A% PTS/FGA FG3M/FGM FTA/FGA TRU% AST_TOV
0 2019 552262 234384.64 92997 202223 28032 78279 40949 52906 22802 79318 102120 55445 17368 11085 31685 47615 254975 0.459874 0.358104 0.773995 0.387092 1.260861 0.301429 0.261622 0.560746 1.749882
1 2020 562518 235759.48 95849 205754 29549 80653 39624 50917 22918 80151 103069 57311 17491 11272 30520 45152 260871 0.465843 0.366372 0.778208 0.391988 1.267878 0.308287 0.247465 0.567260 1.877818
2 2021 635572 264004.96 106569 231293 32733 92552 44740 57709 27052 89602 116654 64618 20006 12387 34372 52038 290611 0.460753 0.353671 0.775269 0.400150 1.256463 0.307153 0.249506 0.561665 1.879960
3 2022 635386 266600.04 110010 231870 32382 89926 48136 61516 27403 86695 114098 66265 19078 12250 35066 52438 300538 0.474447 0.360096 0.782496 0.387829 1.296149 0.294355 0.265304 0.575545 1.889722
4 2023 566250 236554.12 98921 208504 30096 82256 40161 51223 24729 77368 102097 62597 17468 12070 30241 44178 268099 0.474432 0.365882 0.784042 0.394506 1.285822 0.304243 0.245669 0.575728 2.069938
In [26]:
change_per48_df = change_df.copy()
In [27]:
change_per48_df.columns[2:18]
Out[27]:
Index(['POSS_est', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')
In [28]:
change_per48_df = change_df.copy()
for col in change_per48_df.columns [2:18]:
    change_per48_df[col] = (change_per48_df[col]/change_per48_df['MIN'])*48*5

change_per48_df.drop(columns='MIN', inplace=True)

fig = go.Figure()
for col in change_per48_df.columns[1:]:
    fig.add_trace(go.Scatter(x=change_per48_df['season_start_year'],
                             y=change_per48_df[col], name=col))

fig.show()
In [29]:
print(change_per48_df.columns)
Index(['season_start_year', 'POSS_est', 'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM',
       'FTA', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'FG%', '3PT%', 'FT%', 'FG3A%', 'PTS/FGA', 'FG3M/FGM', 'FTA/FGA', 'TRU%',
       'AST_TOV'],
      dtype='object')
In [30]:
change_per100_df = change_df.copy()

for col in change_per100_df.columns [3:18]:
    change_per100_df[col] = (change_per100_df[col]/change_per100_df['POSS_est'])*100

change_per100_df.drop(columns=['MIN','POSS_est'], inplace=True)

fig = go.Figure()
for col in change_per100_df.columns[1:]:
    fig.add_trace(go.Scatter(x=change_per100_df['season_start_year'],
                             y=change_per100_df[col], name=col))

fig.show()
In [ ]: